/*******************************************************************************

Ryan Hill: ryan.hill@kellogg.northwestern.edu
Carolyn Stein: carolyn_stein@berkeley.edu
Last modified: November 2024

Inputs: 	pdb_full_structures.dta
			pdb_full_papers.dta
			pdb_analysis.dta
			phat.dta
			
Outputs:	Tables 4, E7
			Figures 7, 8, E7, E8

Purpose: Test whether structural genomics groups care less about competition

*******************************************************************************/

* Build the analysis sample. No filter on structuralGenomics is applied here,
* so structural genomics (SG) structures stay in the data.
clear all
use "${data_built}pdb_full_structures.dta"

* Analysis variables: error out if any row exists only in the using file,
* then retain matched structures only
merge 1:1 structureId using "${data_built}pdb_analysis.dta", 				///
	assert(master match) keep(match) nogenerate

* Paper-level variables: retain every structure in memory (matched or not)
* and discard rows that exist only in the papers file
merge m:1 pubmedId using "${data_built}pdb_full_papers.dta", 				///
	keep(master match) nogenerate

* Predicted potential: every row on both sides is required to match
merge 1:1 structureId using "${data_built}phat.dta", assert(match) nogenerate

* Non-SG indicator and its interaction with potential
* NOTE(review): a missing structuralGenomics value yields nonSG = 0 here --
* confirm the variable has no missing values in this sample
generate nonSG = (structuralGenomics == 0)
generate interaction = predPtileCites3 * nonSG

/*------------------------------------------------------------------------------

	Table 4: Effect of Potential on Maturation and Quality by SG Status

------------------------------------------------------------------------------*/

* Results matrix, one column per outcome.
*   Rows  1-10: specification with time controls only
*   Rows 11-20: specification adding complexity controls
*     (within each: coefficient, robust SE, p-value for predPtileCites3,
*      nonSG and interaction, followed by R-squared)
*   Row 21: mean of the outcome;  Row 22: number of observations of the outcome
matrix SG_main_results = J(22,2,.)
local col = 0

foreach y in maturation indexStd {

	local ++col
	local row = 0

	forvalues spec = 1/2 {

		* Spec 1: time controls; spec 2: time + complexity controls
		if `spec' == 1 {
			reg `y' predPtileCites3 nonSG interaction $time_controls, r
		}
		else {
			reg `y' predPtileCites3 nonSG interaction $complexity_controls 	///
			$time_controls, r
		}

		* Coefficient, SE and two-sided p-value for each regressor of interest
		foreach x in predPtileCites3 nonSG interaction {
			local ++row
			matrix SG_main_results[`row',`col'] = _b[`x']
			local ++row
			matrix SG_main_results[`row',`col'] = _se[`x']
			local t = _b[`x'] / _se[`x']
			local ++row
			matrix SG_main_results[`row',`col'] = 2*ttail(e(df_r), abs(`t'))
		}

		local ++row
		matrix SG_main_results[`row',`col'] = e(r2)
	}

	* Outcome mean and count
	* NOTE(review): summarize is not restricted to e(sample), so these use every
	* nonmissing value of the outcome in memory -- confirm this is intended
	sum `y'
	local ++row
	matrix SG_main_results[`row',`col'] = r(mean)
	local ++row
	matrix SG_main_results[`row',`col'] = r(N)
}

* Write the matrix to the table4 sheet, leaving the data in memory untouched
preserve
	clear
	svmat SG_main_results
	export excel "${tables}Tables.xlsx", sheet(table4) cell(D60) sheetmodify
restore

/*------------------------------------------------------------------------------

	Table E7: Effect of Potential on Quality by SG Status (Additional Outcomes)

------------------------------------------------------------------------------*/

* Results matrix, one column per additional quality outcome.
*   Rows  1-10: specification with time controls only
*   Rows 11-20: specification adding complexity controls
*     (within each: coefficient, robust SE, p-value for predPtileCites3,
*      nonSG and interaction, followed by R-squared)
*   Row 21: mean of the outcome;  Row 22: number of observations of the outcome
matrix SG_add_results = J(22,3,.)
local col = 0

* Regressions for standardized resolution, R-free and Ramachandran outcomes
foreach y in resolutionStd rFreeStd ramaRawStd {

	local ++col
	local row = 0

	forvalues spec = 1/2 {

		* Spec 1: time controls; spec 2: time + complexity controls
		if `spec' == 1 {
			reg `y' predPtileCites3 nonSG interaction $time_controls, r
		}
		else {
			reg `y' predPtileCites3 nonSG interaction $complexity_controls 	///
			$time_controls, r
		}

		* Coefficient, SE and two-sided p-value for each regressor of interest
		foreach x in predPtileCites3 nonSG interaction {
			local ++row
			matrix SG_add_results[`row',`col'] = _b[`x']
			local ++row
			matrix SG_add_results[`row',`col'] = _se[`x']
			local t = _b[`x'] / _se[`x']
			local ++row
			matrix SG_add_results[`row',`col'] = 2*ttail(e(df_r), abs(`t'))
		}

		local ++row
		matrix SG_add_results[`row',`col'] = e(r2)
	}

	* Outcome mean and count
	* NOTE(review): summarize is not restricted to e(sample), so these use every
	* nonmissing value of the outcome in memory -- confirm this is intended
	sum `y'
	local ++row
	matrix SG_add_results[`row',`col'] = r(mean)
	local ++row
	matrix SG_add_results[`row',`col'] = r(N)
}

* Write the matrix to the tableE7 sheet, leaving the data in memory untouched
preserve
	clear
	svmat SG_add_results
	export excel "${tables}Tables.xlsx", sheet(tableE7) cell(D60) sheetmodify
restore

/*------------------------------------------------------------------------------

	Figure 7: Effect of Potential on Maturation by SG Status
	Figure 8: Effect of Potential on Quality by SG Status

------------------------------------------------------------------------------*/

* For each outcome, run one binscatter per SG status and keep the pieces of the
* stored graph command needed to overlay both groups in a single figure.
foreach y in maturation indexStd {
	forvalues sg = 0/1 {

		* Locals are suffixed 1 for non-SG (structuralGenomics == 0), 2 for SG
		local g = `sg' + 1

		binscatter `y' predPtileCites3 if structuralGenomics == `sg', 		///
		controls($time_controls)

		* Split the stored twoway command on "(" and keep tokens 3, 9 and 11
		* NOTE(review): these positions depend on the layout of binscatter's
		* e(graphcmd) -- recheck if binscatter or its options change
		local cmd = e(graphcmd)
		tokenize "`cmd'", parse("(")
		local ptsRaw = "`3'"
		local fitRaw = "`9'"
		local rngRaw = "`11'"

		* Trim each token at its first "," or ")" delimiter
		tokenize "`ptsRaw'", parse(",")
		local scatter`g'_`y' = "`1'"
		tokenize "`fitRaw'", parse(",")
		local line`g'_`y' = "`1'"
		tokenize "`rngRaw'", parse(")")
		local range`g'_`y' = "`1'"
	}
}

* Figure 7: maturation, non-SG (navy) overlaid with SG (maroon)
twoway (`scatter1_maturation', mcolor(navy%30)) 							///
	(`line1_maturation', range(`range1_maturation') lcolor(navy)) 			///
	(`scatter2_maturation', mcolor(maroon%30) msymbol(D)) 					///
	(`line2_maturation', range(`range2_maturation') lcolor(maroon)), 		///
	xtitle("Potential" "Predicted three-year citation percentile") 			///
	ytitle("Maturation" "(years between collection and deposition)") 		///
	xlab(20(20)80) ylab(0.5(0.5)2) 											///
	legend(order(1 "Non-SG structures" 3 "SG structures") 					///
	position(6) row(1))
graph save "${figures}figure7.gph", replace
graph export "${figures}figure7.pdf", replace

* Figure 8: quality index, same layout as Figure 7
twoway (`scatter1_indexStd', mcolor(navy%30)) 								///
	(`line1_indexStd', range(`range1_indexStd') lcolor(navy)) 				///
	(`scatter2_indexStd', mcolor(maroon%30) msymbol(D)) 					///
	(`line2_indexStd', range(`range2_indexStd') lcolor(maroon)), 			///
	xtitle("Potential" "Predicted three-year citation percentile") 			///
	ytitle("Quality" "Standardized quality index") 							///
	xlab(20(20)80) ylab(-0.6(0.3)0.6) 										///
	legend(order(1 "Non-SG structures" 3 "SG structures") 					///
	position(6) row(1))
graph save "${figures}figure8.gph", replace
graph export "${figures}figure8.pdf", replace
	
/*------------------------------------------------------------------------------

	Figure E7: Potential Distributions by SG Status

------------------------------------------------------------------------------*/

* Overlaid histograms of potential in 5-point bins starting at 0:
* shaded navy bars for non-SG structures, black outlines for SG structures
twoway 																		///
	(histogram predPtileCites3 if structuralGenomics == 0, 					///
		start(0) width(5) fcolor(navy%30) lcolor(navy%30)) 					///
	(histogram predPtileCites3 if structuralGenomics == 1, 					///
		start(0) width(5) fcolor(none) lcolor(black)), 						///
	xtitle("Potential" "Predicted three-year citation percentile") 			///
	ytitle("Density") 														///
	legend(order(1 "Non-SG structures" 2 "SG structures") position(6) row(1))

* Save both the editable .gph and the exported .pdf
graph save "${figures}figureE7.gph", replace
graph export "${figures}figureE7.pdf", replace

/*------------------------------------------------------------------------------

	Figure E8: Effect of Potential on Quality by SG Status (Additional Outcomes)

------------------------------------------------------------------------------*/

* For each outcome, run one binscatter per SG status and keep the pieces of the
* stored graph command needed to overlay both groups in a single panel.
foreach y in resolutionStd rFreeStd ramaRawStd indexStd {
	forvalues sg = 0/1 {

		* Locals are suffixed 1 for non-SG (structuralGenomics == 0), 2 for SG
		local g = `sg' + 1

		binscatter `y' predPtileCites3 if structuralGenomics == `sg', 		///
		controls($time_controls)

		* Split the stored twoway command on "(" and keep tokens 3, 9 and 11
		* NOTE(review): these positions depend on the layout of binscatter's
		* e(graphcmd) -- recheck if binscatter or its options change
		local temp1 = e(graphcmd)
		tokenize "`temp1'", parse("(")
		local temp2 = "`3'"
		local temp3 = "`9'"
		local temp4 = "`11'"

		* Trim each token at its first "," or ")" delimiter
		tokenize "`temp2'", parse(",")
		local scatter`g'_`y' = "`1'"
		tokenize "`temp3'", parse(",")
		local line`g'_`y' = "`1'"
		tokenize "`temp4'", parse(")")
		local range`g'_`y' = "`1'"
	}
}

* Panel titles, keyed by outcome
local title_resolutionStd "Panel A: Standardized resolution"
local title_rFreeStd "Panel B: Standardized R-free"
local title_ramaRawStd "Panel C: Standardized Ramachandran outliers"
local title_indexStd "Panel D: Standardized quality index"

* One named in-memory graph per outcome. name(..., replace) lets this section
* be re-run in a session where the named graphs already exist; without it
* twoway stops with an error because the graph name is already in use.
foreach y in resolutionStd rFreeStd ramaRawStd indexStd {
	twoway (`scatter1_`y'', mcolor(navy%30)) 								///
		(`line1_`y'', range(`range1_`y'') lcolor(navy)) 					///
		(`scatter2_`y'', mcolor(maroon%30) msymbol(D)) 						///
		(`line2_`y'', range(`range2_`y'') lcolor(maroon)), 					///
		title("`title_`y''") 												///
		xtitle("Potential" "Predicted three-year citation percentile") 		///
		ytitle("Quality") xlab(20(20)80) ylab(-0.6(0.3)0.6) 				///
		legend(order(1 "Non-SG structures" 3 "SG structures") 				///
		position(6) row(1)) name(binscatter_`y', replace)
}

* Combine the four panels under one shared legend and save the figure
grc1leg binscatter_resolutionStd binscatter_rFreeStd 						///
binscatter_ramaRawStd binscatter_indexStd, ysize(5) iscale(*.9)

graph save "${figures}figureE8.gph", replace
graph export "${figures}figureE8.pdf", replace
	


	
